#!/usr/bin/env python
# -*- coding: utf-8 -*-
# 
# Copyright (c) 2017 Baidu.com, Inc. All Rights Reserved
# 

"""
File: unit5.py
Author: zhangyang(zhangyang40@baidu.com)
Date: 2018/2/5 2:21 PM
"""
import warnings

# Ignore UserWarning issued from modules whose name matches 'gensim'.
# The filter is installed before the gensim imports below so that it is already
# in effect when those imports run; keep this ordering when editing.
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import logging

# Configure the root logger: timestamp, level and message, at INFO and above.
# NOTE(review): presumably enabled so gensim's progress messages are visible
# while the long-running steps below execute -- confirm.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
from gensim.corpora import MmCorpus, Dictionary
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models import lsimodel

if __name__ == '__main__':
    # Locations of the input dump and of the artifacts this script writes.
    dump_path = 'data/enwiki-20100622-pages-articles.xml.bz2'
    corpus_path = 'data/wiki_en_tfidf.mm'
    vocab_path = 'data/wiki_en_wordids.txt'

    # Step 1: wrap the compressed Wikipedia dump as a gensim corpus.
    wiki = WikiCorpus(dump_path)

    # Step 2: write the corpus to disk in Matrix Market format, then dump the
    # corpus' dictionary (word <-> id mapping) as plain text.
    # Note: no TF-IDF transformation is applied anywhere in this script, even
    # though the output file name contains "tfidf".
    MmCorpus.serialize(corpus_path, wiki)
    wiki.dictionary.save_as_text(vocab_path)

    # Step 3: read both artifacts back from disk.
    # (The original author also left a commented-out alternative that opened a
    # bz2-compressed copy: MmCorpus(bz2.BZ2File('wiki_en_tfidf.mm.bz2')).)
    id2word = Dictionary.load_from_text(vocab_path)
    mm = MmCorpus(corpus_path)

    # Step 4: train a 400-topic LSI model on the reloaded corpus. The model is
    # only kept in memory here; nothing in this script saves it.
    lsi = lsimodel.LsiModel(
        corpus=mm,
        id2word=id2word,
        num_topics=400,
    )
